VSURF

Author

Miguel Fudolig

library(tidyverse)
library(ggplot2)
library(car)
library(glmnet)
library(randomForestSRC)
library(caret)
library(ggRandomForests)
library(VSURF)

Data set

This data set is from the 2015 Asian American Quality of Life survey. Participants are from Austin, Texas.

Input data set

qol <- read_csv("AAQoL.csv") |> mutate(across(where(is.character), ~as.factor(.x))) |> 
  mutate(`English Difficulties`=relevel(`English Difficulties`,ref="Not at all"),
         `English Speaking`=relevel(`English Speaking`,ref="Not at all"),
         Ethnicity = relevel(Ethnicity,ref="Chinese"),
         Religion=relevel(Religion,ref="None")) |> 
  mutate(Income_median = case_match(Income,"$0 - $9,999"~"Below",
                                         "$10,000 - $19,999" ~"Below",
                                         "$20,000 - $29,999"~"Below",
                                         "$30,000 - $39,999"~"Below",
                                         "$40,000 - $49,999"~"Below",
                                         "$50,000 - $59,999"~"Below",
                                         "$60,000 - $69,999"~"Above",
                                         "$70,000 and over"~"Above",
                                          .default=Income)) |> 
  mutate(Income_median = factor(Income_median, levels=c("Below","Above"))) |> 
  mutate(across(c(`Family Respect`:`Togetherness`,`Close-knit Community`:`Community Trust`),~relevel(.x,ref="Strongly disagree"))) |> 
  mutate(across(c(`See Family`:`Helpful Friends`,`Discrimination`),~as.factor(.x)))
New names:
Rows: 2609 Columns: 231
── Column specification
──────────────────────────────────────────────────────── Delimiter: "," chr
(190): Gender, Ethnicity, Marital Status, No One, Spouse, Children, Gran... dbl
(41): Survey ID, Age, Education Completed, Household Size, Grandparent,...
ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
Specify the column types or set `show_col_types = FALSE` to quiet this message.
• `Other` -> `Other...17`
• `Other` -> `Other...89`
qol |> DT::datatable()
Warning in instance$preRenderHook(instance): It seems your data is too big for
client-side DataTables. You may consider server-side processing:
https://rstudio.github.io/DT/server.html

Family

rfdata <- qol |> filter(Family %in% c("No","Yes")) |> 
  mutate(Family=droplevels(Family)) |> 
  select(Family, Ethnicity, Age, Gender,Religion, `Full Time Employment`,  Income_median, `English Speaking`, `English Difficulties`,`See Family`:`Community Trust`,`Health Insurance`,`Dental Insurance`,`Discrimination`) |> 
    na.omit() |>
  as.data.frame() |> 
  rename_with(make.names)


set.seed(222)
imbal_index <- createDataPartition(rfdata$Family,p=0.8,list=F,times=1)
imbal_train <- rfdata[imbal_index,]
imbal_test <- rfdata[-imbal_index,]
imbal <- ROSE::ROSE(Family~.,
                          data=imbal_train,
                          seed=3)$data

VSURF(imbal[,2:ncol(imbal)],imbal[,1],parallel=T,verbose=F)->vsurf.mod

vsurf.mod |> summary()

 VSURF computation time: 39.2 secs 

 VSURF selected: 
    34 variables at thresholding step (in 6.7 secs)
    21 variables at interpretation step (in 6.2 secs)
    15 variables at prediction step (in 26.3 secs)

 VSURF ran in parallel on a PSOCK cluster and used 15 cores 
names(rfdata[,-1])[vsurf.mod$varselect.pred]
 [1] "Ethnicity"            "Age"                  "Religion"            
 [4] "English.Difficulties" "See.Family"           "Helpful.Family"      
 [7] "See.Friends"          "Close.Family"         "Helpful.Friends"     
[10] "Close.Friends"        "Discrimination"       "Religious.Attendance"
[13] "Gender"               "Religious.Importance" "Close.knit.Community"
names(rfdata[,-1])[vsurf.mod$varselect.interp]
 [1] "Ethnicity"               "Age"                    
 [3] "Religion"                "English.Difficulties"   
 [5] "See.Family"              "Helpful.Family"         
 [7] "See.Friends"             "Close.Family"           
 [9] "Helpful.Friends"         "Close.Friends"          
[11] "Discrimination"          "Religious.Attendance"   
[13] "Gender"                  "Religious.Importance"   
[15] "English.Speaking"        "Full.Time.Employment"   
[17] "Close.knit.Community"    "Helpful.Community"      
[19] "Community.Shares.Values" "Community.Trust"        
[21] "Successful.Family"      
plot(vsurf.mod)

vsurf.mod$mean.perf
[1] 0.1599222

Importance

vi<- data.frame(Variable=names(rfdata[,-1])[vsurf.mod$imp.mean.dec.ind],
                Importance = vsurf.mod$imp.mean.dec,
                sd_Importance = vsurf.mod$imp.sd.dec
) |> 
  mutate(fill = case_when(Variable=="Ethnicity"~"red",
                                                 .default="black"))

vi |> mutate(across(Importance:sd_Importance,~round(.x,5)))
                  Variable Importance sd_Importance  fill
1                Ethnicity    0.07224       0.00141   red
2                      Age    0.04941       0.00120 black
3                 Religion    0.03771       0.00113 black
4     English.Difficulties    0.03751       0.00101 black
5               See.Family    0.03537       0.00073 black
6           Helpful.Family    0.03532       0.00097 black
7              See.Friends    0.03530       0.00073 black
8             Close.Family    0.03400       0.00069 black
9          Helpful.Friends    0.03387       0.00085 black
10           Close.Friends    0.03364       0.00087 black
11          Discrimination    0.03156       0.00115 black
12    Religious.Attendance    0.02780       0.00064 black
13                  Gender    0.02591       0.00098 black
14    Religious.Importance    0.02258       0.00088 black
15        English.Speaking    0.02221       0.00071 black
16    Full.Time.Employment    0.02172       0.00109 black
17    Close.knit.Community    0.02128       0.00084 black
18       Helpful.Community    0.02127       0.00074 black
19 Community.Shares.Values    0.02075       0.00084 black
20         Community.Trust    0.02038       0.00062 black
21       Successful.Family    0.02009       0.00085 black
22               Get.Along    0.01972       0.00080 black
23              Expression    0.01349       0.00064 black
24          Similar.Values    0.01305       0.00068 black
25            Togetherness    0.01194       0.00077 black
26        Dental.Insurance    0.01190       0.00094 black
27     Spend.Time.Together    0.01092       0.00060 black
28          Family.Respect    0.00913       0.00046 black
29              Feel.Close    0.00911       0.00057 black
30                   Trust    0.00891       0.00041 black
31           Income_median    0.00878       0.00043 black
32            Family.Pride    0.00649       0.00040 black
33                 Loyalty    0.00601       0.00037 black
34        Health.Insurance    0.00359       0.00024 black
importance_plot <- ggplot(vi, aes(x = reorder(Variable, Importance), y = Importance))+
  geom_bar(stat = "identity",alpha=0.4) +
  geom_errorbar(aes(ymin=Importance-sd_Importance, ymax = Importance+sd_Importance))+
  
  labs(title = "Variable Importance", x = "Variable", y = "Importance") +
  theme_classic() +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))+
  scale_fill_manual(values=c("black","red"),
                    guide="none")
  
plot(importance_plot)

ggsave(filename = "VSURF_importance_family.png", width=8, height=12,units="in")

Prediction

pred<- predict(vsurf.mod,newdata=imbal_test[,-1])
caret::confusionMatrix(data=pred$pred,reference=imbal_test[,1],positive="Yes")
Warning in confusionMatrix.default(data = pred$pred, reference = imbal_test[, :
Levels are not in the same order for reference and data. Refactoring data to
match.
Confusion Matrix and Statistics

          Reference
Prediction  No Yes
       No  110  84
       Yes  75 115
                                          
               Accuracy : 0.5859          
                 95% CI : (0.5349, 0.6357)
    No Information Rate : 0.5182          
    P-Value [Acc > NIR] : 0.004511        
                                          
                  Kappa : 0.1722          
                                          
 Mcnemar's Test P-Value : 0.525793        
                                          
            Sensitivity : 0.5779          
            Specificity : 0.5946          
         Pos Pred Value : 0.6053          
         Neg Pred Value : 0.5670          
             Prevalence : 0.5182          
         Detection Rate : 0.2995          
   Detection Prevalence : 0.4948          
      Balanced Accuracy : 0.5862          
                                          
       'Positive' Class : Yes             
                                          
pROC::roc(imbal_test[,1] |> as.ordered(),pred$pred |> as.ordered()) -> rocobj
Setting levels: control = No, case = Yes
Warning in value[[3L]](cond): Ordered predictor converted to numeric vector.
Threshold values will not correspond to values in predictor.
Setting direction: controls > cases
pROC::auc(rocobj)
Area under the curve: 0.5862

Health Professionals

rfdata <- qol |> select(`Heal Professionals`, Ethnicity, Age, Gender,Religion, `Full Time Employment`,  Income_median, `English Speaking`, `English Difficulties`,`See Family`:`Community Trust`,`Health Insurance`,`Dental Insurance`,`Discrimination`) |> 
    na.omit() |>
  as.data.frame() |> 
  rename_with(make.names)

set.seed(222)
imbal_index <- createDataPartition(rfdata$Heal.Professionals,p=0.8,list=F,times=1)
imbal_train <- rfdata[imbal_index,]
imbal_test <- rfdata[-imbal_index,]
imbal <- ROSE::ROSE(Heal.Professionals~.,
                          data=imbal_train,
                          seed=3)$data

VSURF(imbal[,2:ncol(imbal)],imbal[,1],parallel=T,verbose=F)->vsurf.mod

vsurf.mod |> summary()

 VSURF computation time: 16.8 secs 

 VSURF selected: 
    34 variables at thresholding step (in 6.4 secs)
    16 variables at interpretation step (in 5.6 secs)
    2 variables at prediction step (in 4.8 secs)

 VSURF ran in parallel on a PSOCK cluster and used 15 cores 
names(rfdata[,-1])[vsurf.mod$varselect.pred]
[1] "English.Speaking" "Dental.Insurance"
names(rfdata[,-1])[vsurf.mod$varselect.interp]
 [1] "English.Speaking"     "Religion"             "Ethnicity"           
 [4] "English.Difficulties" "Dental.Insurance"     "See.Friends"         
 [7] "Close.Family"         "Helpful.Friends"      "See.Family"          
[10] "Close.Friends"        "Helpful.Family"       "Age"                 
[13] "Helpful.Community"    "Income_median"        "Community.Trust"     
[16] "Religious.Attendance"
plot(vsurf.mod)

vsurf.mod$mean.perf
[1] 0.1384565

Importance

vi<- data.frame(Variable=names(rfdata[,-1])[vsurf.mod$imp.mean.dec.ind],
                Importance = vsurf.mod$imp.mean.dec,
                sd_Importance = vsurf.mod$imp.sd.dec
) |> 
  mutate(fill = case_when(Variable=="Ethnicity"~"red",
                                                 .default="black"))

vi |> mutate(across(Importance:sd_Importance,~round(.x,5)))
                  Variable Importance sd_Importance  fill
1         English.Speaking    0.07557       0.00207 black
2                 Religion    0.06309       0.00155 black
3                Ethnicity    0.06005       0.00143   red
4     English.Difficulties    0.04423       0.00159 black
5         Dental.Insurance    0.04375       0.00248 black
6              See.Friends    0.04085       0.00105 black
7             Close.Family    0.03763       0.00113 black
8          Helpful.Friends    0.03613       0.00090 black
9               See.Family    0.03599       0.00073 black
10           Close.Friends    0.03330       0.00086 black
11          Helpful.Family    0.03172       0.00089 black
12                     Age    0.02875       0.00080 black
13       Helpful.Community    0.02442       0.00089 black
14           Income_median    0.02420       0.00173 black
15         Community.Trust    0.02263       0.00070 black
16    Religious.Attendance    0.02168       0.00042 black
17    Religious.Importance    0.02145       0.00066 black
18               Get.Along    0.02125       0.00092 black
19    Close.knit.Community    0.02055       0.00070 black
20 Community.Shares.Values    0.01886       0.00067 black
21        Health.Insurance    0.01233       0.00069 black
22          Family.Respect    0.01034       0.00053 black
23                   Trust    0.01006       0.00056 black
24          Similar.Values    0.00991       0.00047 black
25              Expression    0.00967       0.00037 black
26     Spend.Time.Together    0.00967       0.00048 black
27       Successful.Family    0.00954       0.00047 black
28              Feel.Close    0.00936       0.00042 black
29          Discrimination    0.00865       0.00043 black
30            Family.Pride    0.00845       0.00040 black
31            Togetherness    0.00797       0.00034 black
32                  Gender    0.00686       0.00031 black
33                 Loyalty    0.00666       0.00030 black
34    Full.Time.Employment    0.00605       0.00039 black
importance_plot <- ggplot(vi, aes(x = reorder(Variable, Importance), y = Importance))+
  geom_bar(stat = "identity",alpha=0.4) +
  geom_errorbar(aes(ymin=Importance-sd_Importance, ymax = Importance+sd_Importance))+
  # coord_flip() +
  labs(title = "Variable Importance", x = "Variable", y = "Importance") +
  theme_classic() +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))+
  scale_fill_manual(values=c("black","red"),
                    guide="none")
  
plot(importance_plot)

ggsave(filename = "VSURF_importance_hp.png", width=8, height=12,units="in")

Prediction

pred<- predict(vsurf.mod,newdata=imbal_test[,-1])
caret::confusionMatrix(data=pred$pred,reference=imbal_test[,1],positive="Yes")
Warning in confusionMatrix.default(data = pred$pred, reference = imbal_test[, :
Levels are not in the same order for reference and data. Refactoring data to
match.
Confusion Matrix and Statistics

          Reference
Prediction  No Yes
       No  100  73
       Yes  85 127
                                          
               Accuracy : 0.5896          
                 95% CI : (0.5386, 0.6392)
    No Information Rate : 0.5195          
    P-Value [Acc > NIR] : 0.003353        
                                          
                  Kappa : 0.176           
                                          
 Mcnemar's Test P-Value : 0.381512        
                                          
            Sensitivity : 0.6350          
            Specificity : 0.5405          
         Pos Pred Value : 0.5991          
         Neg Pred Value : 0.5780          
             Prevalence : 0.5195          
         Detection Rate : 0.3299          
   Detection Prevalence : 0.5506          
      Balanced Accuracy : 0.5878          
                                          
       'Positive' Class : Yes             
                                          
pROC::roc(imbal_test[,1] |> as.ordered(),pred$pred |> as.ordered()) -> rocobj
Setting levels: control = No, case = Yes
Warning in value[[3L]](cond): Ordered predictor converted to numeric vector.
Threshold values will not correspond to values in predictor.
Setting direction: controls > cases
pROC::auc(rocobj)
Area under the curve: 0.5878

Physical Check-up

#install.packages("randomForestSRC)

rfdata <- qol |> 
  select(`Physical Check-up`, Ethnicity, Age, Gender,Religion, `Full Time Employment`, Income_median, `English Speaking`, `English Difficulties`,`See Family`:`Community Trust`,`Health Insurance`,`Dental Insurance`,`Discrimination`) %>%
  na.omit() |> 
  rename(Employment=`Full Time Employment`,
         EnglishSpeak=`English Speaking`,
         EnglishDiff=`English Difficulties`) |> 
  as.data.frame() |> 
  rename_with(make.names)

set.seed(222)
imbal_index <- createDataPartition(rfdata$Physical.Check.up,p=0.8,list=F,times=1)
imbal_train <- rfdata[imbal_index,]
imbal_test <- rfdata[-imbal_index,]

imbal <- ROSE::ROSE(Physical.Check.up~.,
                          data=imbal_train,
                          seed=3)$data

VSURF(imbal[,2:ncol(imbal)],imbal[,1],parallel=T,verbose=F)->vsurf.mod

vsurf.mod |> summary()

 VSURF computation time: 37.3 secs 

 VSURF selected: 
    34 variables at thresholding step (in 6 secs)
    26 variables at interpretation step (in 5.1 secs)
    18 variables at prediction step (in 26.2 secs)

 VSURF ran in parallel on a PSOCK cluster and used 15 cores 
names(rfdata[,-1])[vsurf.mod$varselect.pred]
 [1] "Dental.Insurance"     "Age"                  "Health.Insurance"    
 [4] "Ethnicity"            "Helpful.Family"       "Religion"            
 [7] "Gender"               "See.Family"           "EnglishDiff"         
[10] "See.Friends"          "Income_median"        "Close.Family"        
[13] "Helpful.Friends"      "Close.Friends"        "Religious.Importance"
[16] "EnglishSpeak"         "Helpful.Community"    "Spend.Time.Together" 
names(rfdata[,-1])[vsurf.mod$varselect.interp]
 [1] "Dental.Insurance"        "Age"                    
 [3] "Health.Insurance"        "Ethnicity"              
 [5] "Helpful.Family"          "Religion"               
 [7] "Gender"                  "See.Family"             
 [9] "EnglishDiff"             "See.Friends"            
[11] "Income_median"           "Close.Family"           
[13] "Helpful.Friends"         "Close.Friends"          
[15] "Religious.Importance"    "EnglishSpeak"           
[17] "Close.knit.Community"    "Helpful.Community"      
[19] "Religious.Attendance"    "Community.Shares.Values"
[21] "Get.Along"               "Community.Trust"        
[23] "Discrimination"          "Spend.Time.Together"    
[25] "Employment"              "Expression"             
plot(vsurf.mod)

vsurf.mod$mean.perf
[1] 0.09121094

Importance

vi<- data.frame(Variable=names(rfdata[,-1])[vsurf.mod$imp.mean.dec.ind],
                Importance = vsurf.mod$imp.mean.dec,
                sd_Importance = vsurf.mod$imp.sd.dec
) |> 
  mutate(fill = case_when(Variable=="Ethnicity"~"red",
                                                 .default="black"))

vi |> mutate(across(Importance:sd_Importance,~round(.x,5)))
                  Variable Importance sd_Importance  fill
1         Dental.Insurance    0.07270       0.00199 black
2                      Age    0.06376       0.00151 black
3         Health.Insurance    0.05786       0.00155 black
4                Ethnicity    0.05548       0.00123   red
5           Helpful.Family    0.03567       0.00110 black
6                 Religion    0.03456       0.00102 black
7                   Gender    0.03405       0.00095 black
8               See.Family    0.03347       0.00078 black
9              EnglishDiff    0.03337       0.00113 black
10             See.Friends    0.03066       0.00084 black
11           Income_median    0.02859       0.00136 black
12            Close.Family    0.02760       0.00063 black
13         Helpful.Friends    0.02711       0.00068 black
14           Close.Friends    0.02547       0.00070 black
15    Religious.Importance    0.02493       0.00086 black
16            EnglishSpeak    0.02400       0.00107 black
17    Close.knit.Community    0.02199       0.00068 black
18       Helpful.Community    0.02002       0.00097 black
19    Religious.Attendance    0.01943       0.00059 black
20 Community.Shares.Values    0.01868       0.00077 black
21               Get.Along    0.01606       0.00076 black
22         Community.Trust    0.01538       0.00061 black
23          Discrimination    0.01490       0.00080 black
24     Spend.Time.Together    0.01166       0.00054 black
25              Employment    0.01088       0.00077 black
26              Expression    0.00994       0.00040 black
27          Similar.Values    0.00927       0.00033 black
28                   Trust    0.00792       0.00039 black
29              Feel.Close    0.00734       0.00038 black
30            Family.Pride    0.00691       0.00041 black
31          Family.Respect    0.00682       0.00025 black
32            Togetherness    0.00646       0.00029 black
33       Successful.Family    0.00599       0.00026 black
34                 Loyalty    0.00586       0.00017 black
importance_plot <- ggplot(vi, aes(x = reorder(Variable, Importance), y = Importance))+
  geom_bar(stat = "identity",alpha=0.4) +
  geom_errorbar(aes(ymin=Importance-sd_Importance, ymax = Importance+sd_Importance))+
  # coord_flip() +
  labs(title = "Variable Importance", x = "Variable", y = "Importance") +
  theme_classic() +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))+
  scale_fill_manual(values=c("black","red"),
                    guide="none")
  

plot(importance_plot)

ggsave(filename = "VSURF_importance_PC.png", width=8, height=12,units="in")

Prediction

pred<- predict(vsurf.mod,newdata=imbal_test[,-1])
caret::confusionMatrix(data=pred$pred,reference=imbal_test[,1],positive="Yes")
Warning in confusionMatrix.default(data = pred$pred, reference = imbal_test[, :
Levels are not in the same order for reference and data. Refactoring data to
match.
Confusion Matrix and Statistics

          Reference
Prediction   0 Yes
       0    71  69
       Yes  51 191
                                          
               Accuracy : 0.6859          
                 95% CI : (0.6367, 0.7321)
    No Information Rate : 0.6806          
    P-Value [Acc > NIR] : 0.4372          
                                          
                  Kappa : 0.3047          
                                          
 Mcnemar's Test P-Value : 0.1207          
                                          
            Sensitivity : 0.7346          
            Specificity : 0.5820          
         Pos Pred Value : 0.7893          
         Neg Pred Value : 0.5071          
             Prevalence : 0.6806          
         Detection Rate : 0.5000          
   Detection Prevalence : 0.6335          
      Balanced Accuracy : 0.6583          
                                          
       'Positive' Class : Yes             
                                          
pROC::roc(imbal_test[,1] |> as.ordered(),pred$pred |> as.ordered()) -> rocobj
Setting levels: control = 0, case = Yes
Warning in value[[3L]](cond): Ordered predictor converted to numeric vector.
Threshold values will not correspond to values in predictor.
Setting direction: controls > cases
pROC::auc(rocobj)
Area under the curve: 0.6583

Dental Check-up

rfdata <- qol |> select(`Dentist Check-up`, Ethnicity, Age, Gender,Religion, `Full Time Employment`,  Income_median, `English Speaking`, `English Difficulties`,`See Family`:`Community Trust`,`Health Insurance`,`Dental Insurance`,`Discrimination`) |> 
    na.omit() |>
  as.data.frame() |> 
  rename_with(make.names)

set.seed(222)
imbal_index <- createDataPartition(rfdata$Dentist.Check.up,p=0.8,list=F,times=1)
imbal_train <- rfdata[imbal_index,]
imbal_test <- rfdata[-imbal_index,]
imbal <- ROSE::ROSE(Dentist.Check.up~.,
                          data=imbal_train,
                          seed=3)$data

VSURF(imbal[,2:ncol(imbal)],imbal[,1],parallel=T,verbose=F)->vsurf.mod

vsurf.mod |> summary()

 VSURF computation time: 39.9 secs 

 VSURF selected: 
    34 variables at thresholding step (in 6.2 secs)
    31 variables at interpretation step (in 5.1 secs)
    14 variables at prediction step (in 28.6 secs)

 VSURF ran in parallel on a PSOCK cluster and used 15 cores 
names(rfdata[,-1])[vsurf.mod$varselect.pred]
 [1] "Dental.Insurance"     "Religion"             "See.Friends"         
 [4] "Close.Friends"        "Close.Family"         "Helpful.Friends"     
 [7] "English.Speaking"     "Helpful.Family"       "English.Difficulties"
[10] "Religious.Importance" "Health.Insurance"     "Community.Trust"     
[13] "Successful.Family"    "Get.Along"           
names(rfdata[,-1])[vsurf.mod$varselect.interp]
 [1] "Dental.Insurance"        "Ethnicity"              
 [3] "Age"                     "Income_median"          
 [5] "Religion"                "See.Family"             
 [7] "See.Friends"             "Close.Friends"          
 [9] "Close.Family"            "Helpful.Friends"        
[11] "English.Speaking"        "Helpful.Family"         
[13] "English.Difficulties"    "Religious.Importance"   
[15] "Health.Insurance"        "Community.Trust"        
[17] "Community.Shares.Values" "Gender"                 
[19] "Close.knit.Community"    "Religious.Attendance"   
[21] "Successful.Family"       "Get.Along"              
[23] "Helpful.Community"       "Similar.Values"         
[25] "Feel.Close"              "Spend.Time.Together"    
[27] "Expression"              "Full.Time.Employment"   
[29] "Family.Respect"          "Togetherness"           
[31] "Loyalty"                
plot(vsurf.mod)

vsurf.mod$mean.perf
[1] 0.1140248

Importance

vi<- data.frame(Variable=names(rfdata[,-1])[vsurf.mod$imp.mean.dec.ind],
                Importance = vsurf.mod$imp.mean.dec,
                sd_Importance = vsurf.mod$imp.sd.dec
) |> 
  mutate(fill = case_when(Variable=="Ethnicity"~"red",
                                                 .default="black"))

vi |> mutate(across(Importance:sd_Importance,~round(.x,5)))
                  Variable Importance sd_Importance  fill
1         Dental.Insurance    0.11404       0.00189 black
2                Ethnicity    0.05543       0.00167   red
3                      Age    0.04809       0.00148 black
4            Income_median    0.04399       0.00227 black
5                 Religion    0.04101       0.00122 black
6               See.Family    0.03607       0.00098 black
7              See.Friends    0.03576       0.00062 black
8            Close.Friends    0.03027       0.00065 black
9             Close.Family    0.03005       0.00072 black
10         Helpful.Friends    0.02874       0.00102 black
11        English.Speaking    0.02806       0.00101 black
12          Helpful.Family    0.02748       0.00082 black
13    English.Difficulties    0.02480       0.00098 black
14    Religious.Importance    0.02143       0.00064 black
15        Health.Insurance    0.02010       0.00126 black
16         Community.Trust    0.01850       0.00075 black
17 Community.Shares.Values    0.01781       0.00063 black
18                  Gender    0.01712       0.00114 black
19    Close.knit.Community    0.01666       0.00054 black
20    Religious.Attendance    0.01605       0.00043 black
21       Successful.Family    0.01385       0.00067 black
22               Get.Along    0.01369       0.00040 black
23       Helpful.Community    0.01285       0.00046 black
24          Similar.Values    0.01165       0.00047 black
25              Feel.Close    0.01049       0.00043 black
26     Spend.Time.Together    0.00915       0.00045 black
27              Expression    0.00899       0.00053 black
28    Full.Time.Employment    0.00850       0.00073 black
29          Family.Respect    0.00786       0.00045 black
30            Togetherness    0.00671       0.00028 black
31                 Loyalty    0.00666       0.00038 black
32          Discrimination    0.00645       0.00042 black
33                   Trust    0.00622       0.00039 black
34            Family.Pride    0.00529       0.00029 black
importance_plot <- ggplot(vi, aes(x = reorder(Variable, Importance), y = Importance))+
  geom_bar(stat = "identity",alpha=0.4) +
  geom_errorbar(aes(ymin=Importance-sd_Importance, ymax = Importance+sd_Importance))+
  # coord_flip() +
  labs(title = "Variable Importance", x = "Variable", y = "Importance") +
  theme_classic() +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))+
  scale_fill_manual(values=c("black","red"),
                    guide="none")
  

plot(importance_plot)

ggsave(filename = "VSURF_importance_Dc.png", width=8, height=12,units="in")

Prediction

pred<- predict(vsurf.mod,newdata=imbal_test[,-1])
caret::confusionMatrix(data=pred$pred,reference=imbal_test[,1],positive="Yes")
Warning in confusionMatrix.default(data = pred$pred, reference = imbal_test[, :
Levels are not in the same order for reference and data. Refactoring data to
match.
Confusion Matrix and Statistics

          Reference
Prediction   0 Yes
       0   117  74
       Yes  40 151
                                         
               Accuracy : 0.7016         
                 95% CI : (0.6529, 0.747)
    No Information Rate : 0.589          
    P-Value [Acc > NIR] : 3.417e-06      
                                         
                  Kappa : 0.4031         
                                         
 Mcnemar's Test P-Value : 0.001997       
                                         
            Sensitivity : 0.6711         
            Specificity : 0.7452         
         Pos Pred Value : 0.7906         
         Neg Pred Value : 0.6126         
             Prevalence : 0.5890         
         Detection Rate : 0.3953         
   Detection Prevalence : 0.5000         
      Balanced Accuracy : 0.7082         
                                         
       'Positive' Class : Yes            
                                         
pROC::roc(imbal_test[,1] |> as.ordered(),pred$pred |> as.ordered()) -> rocobj
Setting levels: control = 0, case = Yes
Warning in value[[3L]](cond): Ordered predictor converted to numeric vector.
Threshold values will not correspond to values in predictor.
Setting direction: controls > cases
pROC::auc(rocobj)
Area under the curve: 0.7082

Folkmedicine

rfdata <- qol |> select(`Folkmedicine`, Ethnicity, Age, Gender,Religion, `Full Time Employment`,  Income_median, `English Speaking`, `English Difficulties`,`See Family`:`Community Trust`,`Health Insurance`,`Dental Insurance`,`Discrimination`) |> 
    na.omit() |>
  as.data.frame() |> 
  rename_with(make.names)

set.seed(222)
imbal_index <- createDataPartition(rfdata$Folkmedicine,p=0.8,list=F,times=1)
imbal_train <- rfdata[imbal_index,]
imbal_test <- rfdata[-imbal_index,]
imbal <- ROSE::ROSE(Folkmedicine~.,
                          data=imbal_train,
                          seed=3)$data

VSURF(imbal[,2:ncol(imbal)],imbal[,1],parallel=T,verbose=F)->vsurf.mod

vsurf.mod |> summary()

 VSURF computation time: 38.8 secs 

 VSURF selected: 
    34 variables at thresholding step (in 5.7 secs)
    31 variables at interpretation step (in 6.1 secs)
    15 variables at prediction step (in 27 secs)

 VSURF ran in parallel on a PSOCK cluster and used 15 cores 
names(rfdata[,-1])[vsurf.mod$varselect.pred]
 [1] "Ethnicity"               "Helpful.Friends"        
 [3] "See.Family"              "Close.Friends"          
 [5] "English.Speaking"        "Helpful.Family"         
 [7] "Close.Family"            "English.Difficulties"   
 [9] "Religious.Importance"    "Helpful.Community"      
[11] "Religious.Attendance"    "Community.Shares.Values"
[13] "Get.Along"               "Family.Pride"           
[15] "Feel.Close"             
names(rfdata[,-1])[vsurf.mod$varselect.interp]
 [1] "Ethnicity"               "Age"                    
 [3] "Religion"                "See.Friends"            
 [5] "Helpful.Friends"         "See.Family"             
 [7] "Close.Friends"           "English.Speaking"       
 [9] "Helpful.Family"          "Close.Family"           
[11] "English.Difficulties"    "Religious.Importance"   
[13] "Helpful.Community"       "Religious.Attendance"   
[15] "Community.Shares.Values" "Close.knit.Community"   
[17] "Get.Along"               "Community.Trust"        
[19] "Family.Pride"            "Spend.Time.Together"    
[21] "Feel.Close"              "Similar.Values"         
[23] "Full.Time.Employment"    "Gender"                 
[25] "Loyalty"                 "Family.Respect"         
[27] "Expression"              "Discrimination"         
[29] "Successful.Family"       "Trust"                  
[31] "Dental.Insurance"       
plot(vsurf.mod)

vsurf.mod$mean.perf
[1] 0.03628289

Importance

vi<- data.frame(Variable=names(rfdata[,-1])[vsurf.mod$imp.mean.dec.ind],
                Importance = vsurf.mod$imp.mean.dec,
                sd_Importance = vsurf.mod$imp.sd.dec
)  |> 
  mutate(fill = case_when(Variable=="Ethnicity"~"red",
                                                 .default="black"))

vi |> mutate(across(Importance:sd_Importance,~round(.x,5)))
                  Variable Importance sd_Importance  fill
1                Ethnicity    0.14026       0.00194   red
2                      Age    0.07429       0.00149 black
3                 Religion    0.07297       0.00238 black
4              See.Friends    0.07109       0.00183 black
5          Helpful.Friends    0.05528       0.00142 black
6               See.Family    0.05156       0.00140 black
7            Close.Friends    0.05120       0.00110 black
8         English.Speaking    0.05071       0.00194 black
9           Helpful.Family    0.04559       0.00098 black
10            Close.Family    0.04244       0.00109 black
11    English.Difficulties    0.03898       0.00096 black
12    Religious.Importance    0.03875       0.00138 black
13       Helpful.Community    0.03472       0.00096 black
14    Religious.Attendance    0.03128       0.00106 black
15 Community.Shares.Values    0.03101       0.00137 black
16    Close.knit.Community    0.02941       0.00073 black
17               Get.Along    0.02930       0.00095 black
18         Community.Trust    0.02660       0.00086 black
19            Family.Pride    0.02386       0.00160 black
20     Spend.Time.Together    0.02028       0.00070 black
21              Feel.Close    0.02028       0.00139 black
22          Similar.Values    0.01895       0.00072 black
23    Full.Time.Employment    0.01841       0.00127 black
24                  Gender    0.01679       0.00091 black
25                 Loyalty    0.01669       0.00098 black
26          Family.Respect    0.01572       0.00100 black
27              Expression    0.01547       0.00062 black
28          Discrimination    0.01476       0.00083 black
29       Successful.Family    0.01404       0.00070 black
30                   Trust    0.01356       0.00083 black
31        Dental.Insurance    0.01147       0.00041 black
32            Togetherness    0.01106       0.00041 black
33           Income_median    0.00966       0.00064 black
34        Health.Insurance    0.00565       0.00033 black
importance_plot <- ggplot(vi, aes(x = reorder(Variable, Importance), y = Importance))+
  geom_bar(stat = "identity",alpha=0.4) +
  geom_errorbar(aes(ymin=Importance-sd_Importance, ymax = Importance+sd_Importance))+
  # coord_flip() +
  labs(title = "Variable Importance", x = "Variable", y = "Importance") +
  theme_classic() +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))+
  scale_fill_manual(values=c("black","red"),
                    guide="none")
  

plot(importance_plot)

ggsave(filename = "VSURF_importance_Alt.png", width=8, height=12,units="in")

Prediction

pred<- predict(vsurf.mod,newdata=imbal_test[,-1])
caret::confusionMatrix(data=pred$pred,reference=imbal_test[,1],positive="Yes")
Confusion Matrix and Statistics

          Reference
Prediction   0 Yes
       0   291  43
       Yes  37   8
                                          
               Accuracy : 0.7889          
                 95% CI : (0.7443, 0.8289)
    No Information Rate : 0.8654          
    P-Value [Acc > NIR] : 1.0000          
                                          
                  Kappa : 0.0464          
                                          
 Mcnemar's Test P-Value : 0.5762          
                                          
            Sensitivity : 0.15686         
            Specificity : 0.88720         
         Pos Pred Value : 0.17778         
         Neg Pred Value : 0.87126         
             Prevalence : 0.13456         
         Detection Rate : 0.02111         
   Detection Prevalence : 0.11873         
      Balanced Accuracy : 0.52203         
                                          
       'Positive' Class : Yes             
                                          
pROC::roc(imbal_test[,1] |> as.ordered(),pred$pred |> as.ordered()) -> rocobj
Setting levels: control = 0, case = Yes
Warning in value[[3L]](cond): Ordered predictor converted to numeric vector.
Threshold values will not correspond to values in predictor.
Setting direction: controls < cases
pROC::auc(rocobj)
Area under the curve: 0.522